#Alexander F. Gazmararian
#afg2@princeton.edu
#January 9, 2024

#Purpose: estimate state-specific shale shocks to analyze time-varying treatment onset

# Load packages
library(cpm)
library(modelsummary)
library(strucchange)
library(tidyverse)
library(tidylog)
library(texreg)
library(here)
library(readxl)
library(kableExtra)
library(lubridate)

#Load custom functions
source(here("code", "fun", "fix_txt.R"))

#For replication
set.seed(10)

#Prepare data ----
#load and clean citygate data
city <- read_xls(here("data", "input", "eia_citygategas", "NG_PRI_SUM_A_EPG0_PG1_DMCF_M.xls"), sheet = 2, skip = 2)
#trim names
names(city) <- gsub("Natural Gas Citygate Price in | \\(Dollars per Thousand Cubic Feet\\)", "", names(city))
# Fix District of Columbia
city <- rename(city, `District of Columbia` = `the District of Columbia`)
# Drop US national row
city <- city[, -2]
# Pivot longer
city_long <- pivot_longer(city, !Date, names_to = "state", values_to = "city_price")

#load and clean electric power gas consumers price data
power <- read_xls(here("data", "input", "eia_electricpowerpricegas", "NG_PRI_SUM_A_EPG0_PEU_DMCF_M.xls"), sheet = 2, skip = 2)
# Trim names
names(power) <- gsub(" Natural Gas Price Sold to Electric Power Consumers \\(Dollars per Thousand Cubic Feet\\)", "", names(power))
# Drop US national row
power <- power[, -2]
# Pivot longer
power_long <- pivot_longer(power, !Date, names_to = "state", values_to = "power_price")
#merge
city_long <- subset(city_long, lubridate::year(Date) >= 2002)#subset to only post 2002 to better evaluate missingness since that's when the power data start at the monthly freq
prices <- full_join(city_long, power_long, by = c("Date", "state"))

# Table E1----
# Estimate correlation between city gate and other gas price data.
m <- list()
m[[1]] <- lm(power_price ~ city_price, prices)
m[[2]] <- lm(power_price ~ city_price + state, prices)
m[[3]] <- lm(power_price ~ city_price + state + factor(lubridate::year(Date)), prices)
m[[4]] <- lm(power_price ~ city_price + state + factor(Date), prices)
#Table formating
gof_map[gof_map$raw=="nobs",]$clean <- "$N$"
gof_map[gof_map$raw=="adj.r.squared",]$clean <- "Adjusted $R^2$"
#Create table
file_citgate <- here("output", "tables", "si_tab_E1_citygate_model.txt")
modelsummary(
  m,
  stars = c("*"=.1,"**"=.05,"***"=.01),
  vcov = ~ state,
  gof_omit = "IC|RMSE|R2$|Log|Std",
  gof_map = gof_map,
  coef_map = c("(Intercept)"="Intercept","city_price"="Citygate Price"),
 output = "latex",
  escape = FALSE,
  col.names = c("", "(1)", "(2)", "(3)", "(4)"),
  add_rows = data.frame(
    c("State Fixed Effects", "Year Fixed Effects", "Month Fixed Effects"),
    c("No", "No", "No"),
    c("Yes", "No", "No"),
    c("Yes", "Yes", "No"),
    c("Yes", "No", "Yes")
  )
) %>%
  add_header_above(c(" "=1,"Outcome: Gas Prices for Electric Utilities"=4)) %>%
  cat(., file = file_citgate)
fix_txt(file_citgate)

# Fig. E.1----
#plot citygate versus power prices
citygatecorplot <- prices %>%
  na.omit() %>%
  mutate(year = lubridate::year(Date)) %>%
  group_by(year, state) %>%
  summarise(
    city_price = mean(city_price, na.rm = TRUE),
    power_price = mean(power_price, na.rm = TRUE)
  ) %>%
  pivot_longer(cols=c(city_price,power_price)) %>%
  mutate(name=ifelse(name=="city_price","Citygate","Electric utilities")) %>%
  ggplot() +
  geom_vline(xintercept = 2008,color="red") +
  stat_smooth(aes(x = year, y = value, group=name, lty=name,color=name), se = TRUE, level = .95, method = "loess") +
  scale_color_manual(values = c("black", "grey60")) +
  theme_bw(base_size = 14) +
  scale_y_continuous(labels=scales::dollar) +
  scale_x_continuous(expand=c(0,0)) +
  labs(x = "Year", y = "Dollars per Thousand Cubic Feet", lty="",color="") +
  theme(
    panel.grid = element_blank(),
    plot.caption.position = "plot",
    plot.caption = element_text(hjust = 0),
    legend.position = c(0.9,.9),
    legend.background = element_rect(fill='transparent'),
    legend.box.background = element_rect(fill='transparent',color="transparent")
  )
ggsave(citygatecorplot, filename = here("output", "figures", "si_fig_E2_citygate.png"), dpi = 300, width = 5.75, height = 3, scale = 1.5)

# Fig E.2----
#plot time series
gas.plot <- prices %>%
  na.omit() %>%
  mutate(year = lubridate::year(Date)) %>%
  group_by(year, state) %>%
  summarise(
    city_price = mean(city_price, na.rm = TRUE),
    power_price = mean(power_price, na.rm = TRUE)
  ) %>%
  ggplot() +
  geom_line(aes(x = year, y = city_price, group = state), color = "gray85") +
  geom_smooth(aes(x = year, y = city_price), color = "dodgerblue4", se = FALSE) +
  geom_vline(xintercept = 2008,color="red") +
  theme_bw(base_size = 14) +
  scale_y_continuous(labels=scales::dollar) +
  scale_x_continuous(expand=c(0,0)) +
  labs(x="Year", y="Citygate Gas Prices")+
  theme(
    panel.grid = element_blank(),
    plot.caption.position = "plot",
    plot.caption = element_text(hjust = 0),
    legend.position = ""
  )
gas.plot
ggsave(gas.plot, filename = here("output", "figures", "si_fig_E2_state_specific.png"), dpi = 300, width = 5.75, height = 3, scale = 1.5)

# Identify State-Specific Structural Breaks ---------------------------------------------------------
# Drop district of Columbia
prices <- subset(prices, state != "District of Columbia")
# Create list of states
state_list <- unique(prices$state)
# Pre-allocate memory
change_point <- vector("list", length(state_list))
# loop over state list
for (i in seq_len(length(state_list))) {
  # Subset data to state
  state <- subset(prices, state == paste0(state_list[[i]]) & !is.na(city_price) & Date > "2004-1-01" & Date < "2020-1-01")
  # Create data frame for structural break analysis
  dat <-
    tidyr::tibble(
      ylag0 = state$city_price,
      ylag1 = lag(state$city_price)
    )
  # Find structural breaks
  qlr <- Fstats(ylag0 ~ ylag1, data = dat)
  # Extract year of breakpoint
  year <- lubridate::year(state[qlr$breakpoint, 1]$Date)
  # Calculate significance value for structural break point
  sig <- sctest(qlr, type = "supF")$p.value
  # Save output
  change_point[[i]] <- cbind("state" = state_list[[i]], "break_year" = year, "sig" = sig)
}
# Create dataframe
break_years <- do.call("rbind", change_point)
break_df <- data.frame(break_years)
# Save output
saveRDS(break_df, here("data", "inter", "breaks.rds"))

# Figure E.3----
state_breaks <- merge(prices, break_df, by = "state", all.x = TRUE)
#drop DC and Alaska since not in main analysis
state_breaks <- subset(state_breaks, state!="Alaska")
state.break.plot <- state_breaks %>%
  mutate(break_year = as.numeric(break_year)) %>%
  mutate(year = lubridate::year(Date)) %>%
  filter(year < 2021 & year >= 2000) %>%
  ggplot() +
  geom_line(aes(x = year, y = city_price, group = state), color = "gray85") +
  geom_smooth(aes(x = year, y = city_price), color = "black", se = FALSE) +
  facet_wrap(~state,nrow = 8, ncol = 7, scales = "free_y") +
  geom_vline(aes(xintercept = break_year), color="red") +
  theme_bw(base_size = 14) +
  labs(x="Year", y="Citygate Gas Prices ($)")+
  scale_x_continuous(breaks = c(2005, 2015)) +
  scale_y_continuous(labels=scales::dollar_format(accuracy=1,prefix=""))+
  theme(
    panel.grid = element_blank(),
    plot.caption.position = "plot",
    plot.caption = element_text(hjust = 0),
    legend.position = ""
  )
ggsave(
  state.break.plot,
  filename = here("output", "figures", "si_fig_E3_statebreakplot.png"),
  width = 6.5,
  height = 6.5,
  scale = 1.75
  )
